# Load necessary libraries
library(dplyr)     # For data manipulation
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)     # For printing output nicely
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(tidyr)
library(visdat)
library(arrow)
## 
## Attaching package: 'arrow'
## The following object is masked from 'package:utils':
## 
##     timestamp
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:arrow':
## 
##     duration
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)   # Fo

# Read the CSV file
df_raw <- read.csv("Traffic_Crashes_-_Crashes_20240302.csv")

# Print column names
print(colnames(df_raw))
##  [1] "CRASH_RECORD_ID"               "CRASH_DATE_EST_I"             
##  [3] "CRASH_DATE"                    "POSTED_SPEED_LIMIT"           
##  [5] "TRAFFIC_CONTROL_DEVICE"        "DEVICE_CONDITION"             
##  [7] "WEATHER_CONDITION"             "LIGHTING_CONDITION"           
##  [9] "FIRST_CRASH_TYPE"              "TRAFFICWAY_TYPE"              
## [11] "LANE_CNT"                      "ALIGNMENT"                    
## [13] "ROADWAY_SURFACE_COND"          "ROAD_DEFECT"                  
## [15] "REPORT_TYPE"                   "CRASH_TYPE"                   
## [17] "INTERSECTION_RELATED_I"        "NOT_RIGHT_OF_WAY_I"           
## [19] "HIT_AND_RUN_I"                 "DAMAGE"                       
## [21] "DATE_POLICE_NOTIFIED"          "PRIM_CONTRIBUTORY_CAUSE"      
## [23] "SEC_CONTRIBUTORY_CAUSE"        "STREET_NO"                    
## [25] "STREET_DIRECTION"              "STREET_NAME"                  
## [27] "BEAT_OF_OCCURRENCE"            "PHOTOS_TAKEN_I"               
## [29] "STATEMENTS_TAKEN_I"            "DOORING_I"                    
## [31] "WORK_ZONE_I"                   "WORK_ZONE_TYPE"               
## [33] "WORKERS_PRESENT_I"             "NUM_UNITS"                    
## [35] "MOST_SEVERE_INJURY"            "INJURIES_TOTAL"               
## [37] "INJURIES_FATAL"                "INJURIES_INCAPACITATING"      
## [39] "INJURIES_NON_INCAPACITATING"   "INJURIES_REPORTED_NOT_EVIDENT"
## [41] "INJURIES_NO_INDICATION"        "INJURIES_UNKNOWN"             
## [43] "CRASH_HOUR"                    "CRASH_DAY_OF_WEEK"            
## [45] "CRASH_MONTH"                   "LATITUDE"                     
## [47] "LONGITUDE"                     "LOCATION"
# Print the number of rows and columns
cat("Number of rows:", nrow(df_raw), "\n")
## Number of rows: 810658
cat("Number of columns:", ncol(df_raw), "\n")
## Number of columns: 48
# Display information about the dataset
str(df_raw)
## 'data.frame':    810658 obs. of  48 variables:
##  $ CRASH_RECORD_ID              : chr  "6c1659069e9c6285a650e70d6f9b574ed5f64c12888479093dfeef179c0344ec6d2057eae224b5c0d5dfc278c0a237f8c22543f07fdef2e"| __truncated__ "5f54a59fcb087b12ae5b1acff96a3caf4f2d37e79f8db4106558b34b8a6d2b81af02cf91b576ecd7ced08ffd10fcfd940a84f7613125b89"| __truncated__ "61fcb8c1eb522a6469b460e2134df3d15f82e81fd93e9cafd3dc7e631b9e1ba8b450a63af12bd90d1d2d9b127ea287f88d32e138a4eeba1"| __truncated__ "004cd14d0303a9163aad69a2d7f341b7da2a8572b2ab3378594bfae8ac53dcb604dd8d414f93c290b55862f9f2517ad32e6209cbc8034c2"| __truncated__ ...
##  $ CRASH_DATE_EST_I             : chr  "" "" "" "" ...
##  $ CRASH_DATE                   : chr  "08/18/2023 12:50:00 PM" "07/29/2023 02:45:00 PM" "08/18/2023 05:58:00 PM" "11/26/2019 08:38:00 AM" ...
##  $ POSTED_SPEED_LIMIT           : int  15 30 30 25 20 30 30 35 30 25 ...
##  $ TRAFFIC_CONTROL_DEVICE       : chr  "OTHER" "TRAFFIC SIGNAL" "NO CONTROLS" "NO CONTROLS" ...
##  $ DEVICE_CONDITION             : chr  "FUNCTIONING PROPERLY" "FUNCTIONING PROPERLY" "NO CONTROLS" "NO CONTROLS" ...
##  $ WEATHER_CONDITION            : chr  "CLEAR" "CLEAR" "CLEAR" "CLEAR" ...
##  $ LIGHTING_CONDITION           : chr  "DAYLIGHT" "DAYLIGHT" "DAYLIGHT" "DAYLIGHT" ...
##  $ FIRST_CRASH_TYPE             : chr  "REAR END" "PARKED MOTOR VEHICLE" "PEDALCYCLIST" "PEDESTRIAN" ...
##  $ TRAFFICWAY_TYPE              : chr  "OTHER" "DIVIDED - W/MEDIAN (NOT RAISED)" "NOT DIVIDED" "ONE-WAY" ...
##  $ LANE_CNT                     : chr  "" "" "" "" ...
##  $ ALIGNMENT                    : chr  "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "CURVE ON GRADE" ...
##  $ ROADWAY_SURFACE_COND         : chr  "DRY" "DRY" "DRY" "DRY" ...
##  $ ROAD_DEFECT                  : chr  "NO DEFECTS" "NO DEFECTS" "NO DEFECTS" "NO DEFECTS" ...
##  $ REPORT_TYPE                  : chr  "ON SCENE" "ON SCENE" "ON SCENE" "ON SCENE" ...
##  $ CRASH_TYPE                   : chr  "INJURY AND / OR TOW DUE TO CRASH" "NO INJURY / DRIVE AWAY" "INJURY AND / OR TOW DUE TO CRASH" "INJURY AND / OR TOW DUE TO CRASH" ...
##  $ INTERSECTION_RELATED_I       : chr  "" "" "" "" ...
##  $ NOT_RIGHT_OF_WAY_I           : chr  "" "" "" "" ...
##  $ HIT_AND_RUN_I                : chr  "" "Y" "" "" ...
##  $ DAMAGE                       : chr  "OVER $1,500" "OVER $1,500" "$501 - $1,500" "OVER $1,500" ...
##  $ DATE_POLICE_NOTIFIED         : chr  "08/18/2023 12:55:00 PM" "07/29/2023 02:45:00 PM" "08/18/2023 06:01:00 PM" "11/26/2019 08:38:00 AM" ...
##  $ PRIM_CONTRIBUTORY_CAUSE      : chr  "FOLLOWING TOO CLOSELY" "FAILING TO REDUCE SPEED TO AVOID CRASH" "FAILING TO REDUCE SPEED TO AVOID CRASH" "UNABLE TO DETERMINE" ...
##  $ SEC_CONTRIBUTORY_CAUSE       : chr  "DISTRACTION - FROM INSIDE VEHICLE" "OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER" "UNABLE TO DETERMINE" "NOT APPLICABLE" ...
##  $ STREET_NO                    : int  700 2101 3422 5 3 1732 2 9000 5900 4546 ...
##  $ STREET_DIRECTION             : chr  "W" "S" "N" "W" ...
##  $ STREET_NAME                  : chr  "OHARE ST" "ASHLAND AVE" "LONG AVE" "TERMINAL ST" ...
##  $ BEAT_OF_OCCURRENCE           : int  1654 1235 1633 1655 1653 1814 1652 2221 232 1131 ...
##  $ PHOTOS_TAKEN_I               : chr  "" "" "" "Y" ...
##  $ STATEMENTS_TAKEN_I           : chr  "" "" "" "Y" ...
##  $ DOORING_I                    : chr  "" "" "" "" ...
##  $ WORK_ZONE_I                  : chr  "" "" "" "" ...
##  $ WORK_ZONE_TYPE               : chr  "" "" "" "" ...
##  $ WORKERS_PRESENT_I            : chr  "" "" "" "" ...
##  $ NUM_UNITS                    : int  2 4 2 2 1 2 2 2 2 2 ...
##  $ MOST_SEVERE_INJURY           : chr  "NONINCAPACITATING INJURY" "NO INDICATION OF INJURY" "NONINCAPACITATING INJURY" "FATAL" ...
##  $ INJURIES_TOTAL               : int  1 0 1 1 0 0 0 0 0 0 ...
##  $ INJURIES_FATAL               : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ INJURIES_INCAPACITATING      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ INJURIES_NON_INCAPACITATING  : int  1 0 1 0 0 0 0 0 0 0 ...
##  $ INJURIES_REPORTED_NOT_EVIDENT: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ INJURIES_NO_INDICATION       : int  1 1 1 1 1 3 2 2 2 5 ...
##  $ INJURIES_UNKNOWN             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRASH_HOUR                   : int  12 14 17 8 10 13 17 13 0 19 ...
##  $ CRASH_DAY_OF_WEEK            : int  6 7 6 3 6 7 2 1 1 2 ...
##  $ CRASH_MONTH                  : int  8 7 8 11 8 7 2 8 8 1 ...
##  $ LATITUDE                     : num  NA 41.9 41.9 NA NA ...
##  $ LONGITUDE                    : num  NA -87.7 -87.8 NA NA ...
##  $ LOCATION                     : chr  "" "POINT (-87.665902342962 41.854120262952)" "POINT (-87.761883496974 41.942975745006)" "" ...
# Rename columns
df_1 <- df_raw %>%
  rename(inj_non_incap = INJURIES_NON_INCAPACITATING,
         inj_report_not_evdnt = INJURIES_REPORTED_NOT_EVIDENT)

# Drop unwanted columns
df_1 <- df_raw %>%
  select(-CRASH_RECORD_ID, -CRASH_DATE_EST_I, -LANE_CNT, -LOCATION, -REPORT_TYPE, -DATE_POLICE_NOTIFIED,
         -PHOTOS_TAKEN_I, -STATEMENTS_TAKEN_I, -DOORING_I, -WORK_ZONE_I, -WORK_ZONE_TYPE, -WORKERS_PRESENT_I)

# Assuming df_1 is your dataframe
df_c <- df_1
colnames(df_c)
##  [1] "CRASH_DATE"                    "POSTED_SPEED_LIMIT"           
##  [3] "TRAFFIC_CONTROL_DEVICE"        "DEVICE_CONDITION"             
##  [5] "WEATHER_CONDITION"             "LIGHTING_CONDITION"           
##  [7] "FIRST_CRASH_TYPE"              "TRAFFICWAY_TYPE"              
##  [9] "ALIGNMENT"                     "ROADWAY_SURFACE_COND"         
## [11] "ROAD_DEFECT"                   "CRASH_TYPE"                   
## [13] "INTERSECTION_RELATED_I"        "NOT_RIGHT_OF_WAY_I"           
## [15] "HIT_AND_RUN_I"                 "DAMAGE"                       
## [17] "PRIM_CONTRIBUTORY_CAUSE"       "SEC_CONTRIBUTORY_CAUSE"       
## [19] "STREET_NO"                     "STREET_DIRECTION"             
## [21] "STREET_NAME"                   "BEAT_OF_OCCURRENCE"           
## [23] "NUM_UNITS"                     "MOST_SEVERE_INJURY"           
## [25] "INJURIES_TOTAL"                "INJURIES_FATAL"               
## [27] "INJURIES_INCAPACITATING"       "INJURIES_NON_INCAPACITATING"  
## [29] "INJURIES_REPORTED_NOT_EVIDENT" "INJURIES_NO_INDICATION"       
## [31] "INJURIES_UNKNOWN"              "CRASH_HOUR"                   
## [33] "CRASH_DAY_OF_WEEK"             "CRASH_MONTH"                  
## [35] "LATITUDE"                      "LONGITUDE"
# Assuming df_c is your dataframe
df_c <- df_c %>%
  mutate(INTERSECTION_RELATED_I = if_else(is.na(INTERSECTION_RELATED_I), 'N', INTERSECTION_RELATED_I),
         NOT_RIGHT_OF_WAY_I = if_else(is.na(NOT_RIGHT_OF_WAY_I), 'N', NOT_RIGHT_OF_WAY_I),
         HIT_AND_RUN_I = if_else(is.na(HIT_AND_RUN_I), 'N', HIT_AND_RUN_I))

# Set columns to 0 where injuries_total is missing and crash_type is 'NO INJURY / DRIVE AWAY'
df_c <- df_c %>%
  mutate(INJURIES_TOTAL = ifelse(is.na(INJURIES_TOTAL) & CRASH_TYPE == 'NO INJURY / DRIVE AWAY', 0, INJURIES_TOTAL),
         INJURIES_FATAL = ifelse(is.na(INJURIES_TOTAL) & CRASH_TYPE == 'NO INJURY / DRIVE AWAY', 0, INJURIES_FATAL),
         INJURIES_INCAPACITATING = ifelse(is.na(INJURIES_TOTAL) & CRASH_TYPE == 'NO INJURY / DRIVE AWAY', 0, INJURIES_INCAPACITATING),
         INJURIES_NON_INCAPACITATING = ifelse(is.na(INJURIES_TOTAL) & CRASH_TYPE == 'NO INJURY / DRIVE AWAY', 0, INJURIES_NON_INCAPACITATING),
         INJURIES_REPORTED_NOT_EVIDENT = ifelse(is.na(INJURIES_TOTAL) & CRASH_TYPE == 'NO INJURY / DRIVE AWAY', 0, INJURIES_REPORTED_NOT_EVIDENT),
         INJURIES_NO_INDICATION = ifelse(is.na(INJURIES_TOTAL) & CRASH_TYPE == 'NO INJURY / DRIVE AWAY', 0, INJURIES_NO_INDICATION),
         INJURIES_UNKNOWN = ifelse(is.na(INJURIES_TOTAL) & CRASH_TYPE == 'NO INJURY / DRIVE AWAY', 0, INJURIES_UNKNOWN))

# Drop rows where injuries_total is missing
df_c <- df_c %>%
  filter(!is.na(INJURIES_TOTAL))

# wherever most_severe_injury is missing and injuries_total is 0
df_c <- df_c %>%
  mutate(MOST_SEVERE_INJURY = ifelse(is.na(MOST_SEVERE_INJURY) & INJURIES_TOTAL == 0,
                                     'NO INDICATION OF INJURY', MOST_SEVERE_INJURY))

# Print table of MOST_SEVERE_INJURY values
print(as.data.frame(table(df_c$MOST_SEVERE_INJURY, useNA = "ifany")))
##                       Var1   Freq
## 1                            1007
## 2                    FATAL    889
## 3    INCAPACITATING INJURY  13731
## 4  NO INDICATION OF INJURY 697464
## 5 NONINCAPACITATING INJURY  62887
## 6    REPORTED, NOT EVIDENT  33906
ggplot(df_c, aes(x = MOST_SEVERE_INJURY)) +
  geom_bar() +  # Add bars
  labs(x = "MOST_SEVERE_INJURY", y = "Number of Records", title = "Number of Records per Category") +  # Add labels and title
  theme_minimal() +  # Minimal theme
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) 

# Convert CRASH_DATE to datetime and create crash_year column
df_c <- df_c %>%
  mutate(CRASH_DATE = mdy_hms(CRASH_DATE),
         crash_year = year(CRASH_DATE))

# Create address column
df_c <- df_c %>%
  mutate(
    STREET_NO = as.character(STREET_NO),
    STREET_DIRECTION = as.character(STREET_DIRECTION),
    STREET_NAME = as.character(STREET_NAME),
    address = paste0(STREET_NO, " ", STREET_DIRECTION, " ", STREET_NAME)
  )

# Assuming df_c is your dataframe
df_c2 <- df_c

# Using base R unique function to drop duplicates
df_c2 <- unique(df_c)

# Using dplyr's distinct function
df_c2 <- distinct(df_c)

# Drop rows that have both latitude and longitude == 0
df_c2 <- df_c2 %>%
  filter(LATITUDE != 0 & LONGITUDE != 0)

# Convert columns to integer type
df_c2 <- df_c2 %>%
  mutate(
    BEAT_OF_OCCURRENCE = as.integer(BEAT_OF_OCCURRENCE),
    INJURIES_TOTAL = as.integer(INJURIES_TOTAL),
    INJURIES_FATAL = as.integer(INJURIES_FATAL),
    INJURIES_INCAPACITATING = as.integer(INJURIES_INCAPACITATING),
    INJURIES_NON_INCAPACITATING = as.integer(INJURIES_NON_INCAPACITATING),
    INJURIES_REPORTED_NOT_EVIDENT = as.integer(INJURIES_REPORTED_NOT_EVIDENT),
    INJURIES_NO_INDICATION = as.integer(INJURIES_NO_INDICATION)
  )

# Drop columns "not_right_of_way_i" and "injuries_unknown"
df_c2 <- df_c2 %>%
  select(-NOT_RIGHT_OF_WAY_I, -INJURIES_UNKNOWN)

# Inspect the structure of the dataframe
str(df_c2)
## 'data.frame':    804227 obs. of  36 variables:
##  $ CRASH_DATE                   : POSIXct, format: "2023-07-29 14:45:00" "2023-08-18 17:58:00" ...
##  $ POSTED_SPEED_LIMIT           : int  30 30 10 30 15 30 30 30 30 20 ...
##  $ TRAFFIC_CONTROL_DEVICE       : chr  "TRAFFIC SIGNAL" "NO CONTROLS" "NO CONTROLS" "TRAFFIC SIGNAL" ...
##  $ DEVICE_CONDITION             : chr  "FUNCTIONING PROPERLY" "NO CONTROLS" "NO CONTROLS" "FUNCTIONING PROPERLY" ...
##  $ WEATHER_CONDITION            : chr  "CLEAR" "CLEAR" "UNKNOWN" "CLEAR" ...
##  $ LIGHTING_CONDITION           : chr  "DAYLIGHT" "DAYLIGHT" "UNKNOWN" "DARKNESS" ...
##  $ FIRST_CRASH_TYPE             : chr  "PARKED MOTOR VEHICLE" "PEDALCYCLIST" "ANGLE" "SIDESWIPE OPPOSITE DIRECTION" ...
##  $ TRAFFICWAY_TYPE              : chr  "DIVIDED - W/MEDIAN (NOT RAISED)" "NOT DIVIDED" "PARKING LOT" "NOT DIVIDED" ...
##  $ ALIGNMENT                    : chr  "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" ...
##  $ ROADWAY_SURFACE_COND         : chr  "DRY" "DRY" "UNKNOWN" "UNKNOWN" ...
##  $ ROAD_DEFECT                  : chr  "NO DEFECTS" "NO DEFECTS" "UNKNOWN" "UNKNOWN" ...
##  $ CRASH_TYPE                   : chr  "NO INJURY / DRIVE AWAY" "INJURY AND / OR TOW DUE TO CRASH" "NO INJURY / DRIVE AWAY" "NO INJURY / DRIVE AWAY" ...
##  $ INTERSECTION_RELATED_I       : chr  "" "" "" "" ...
##  $ HIT_AND_RUN_I                : chr  "Y" "" "Y" "Y" ...
##  $ DAMAGE                       : chr  "OVER $1,500" "$501 - $1,500" "$501 - $1,500" "OVER $1,500" ...
##  $ PRIM_CONTRIBUTORY_CAUSE      : chr  "FAILING TO REDUCE SPEED TO AVOID CRASH" "FAILING TO REDUCE SPEED TO AVOID CRASH" "UNABLE TO DETERMINE" "UNABLE TO DETERMINE" ...
##  $ SEC_CONTRIBUTORY_CAUSE       : chr  "OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER" "UNABLE TO DETERMINE" "NOT APPLICABLE" "NOT APPLICABLE" ...
##  $ STREET_NO                    : chr  "2101" "3422" "1320" "999" ...
##  $ STREET_DIRECTION             : chr  "S" "N" "E" "N" ...
##  $ STREET_NAME                  : chr  "ASHLAND AVE" "LONG AVE" "47TH ST" "CALIFORNIA AVE" ...
##  $ BEAT_OF_OCCURRENCE           : int  1235 1633 222 1211 412 731 1712 1811 1214 2525 ...
##  $ NUM_UNITS                    : int  4 2 2 2 2 2 3 2 2 2 ...
##  $ MOST_SEVERE_INJURY           : chr  "NO INDICATION OF INJURY" "NONINCAPACITATING INJURY" "NO INDICATION OF INJURY" "NO INDICATION OF INJURY" ...
##  $ INJURIES_TOTAL               : int  0 1 0 0 0 5 0 0 0 0 ...
##  $ INJURIES_FATAL               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ INJURIES_INCAPACITATING      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ INJURIES_NON_INCAPACITATING  : int  0 1 0 0 0 5 0 0 0 0 ...
##  $ INJURIES_REPORTED_NOT_EVIDENT: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ INJURIES_NO_INDICATION       : int  1 1 2 2 2 0 4 2 3 1 ...
##  $ CRASH_HOUR                   : int  14 17 14 0 12 19 10 18 14 12 ...
##  $ CRASH_DAY_OF_WEEK            : int  7 6 7 7 4 4 1 4 7 1 ...
##  $ CRASH_MONTH                  : int  7 8 7 7 9 8 8 9 8 7 ...
##  $ LATITUDE                     : num  41.9 41.9 41.8 41.9 41.7 ...
##  $ LONGITUDE                    : num  -87.7 -87.8 -87.6 -87.7 -87.6 ...
##  $ crash_year                   : num  2023 2023 2023 2023 2023 ...
##  $ address                      : chr  "2101 S ASHLAND AVE" "3422 N LONG AVE" "1320 E 47TH ST" "999 N CALIFORNIA AVE" ...
#write_parquet(df_c2, file_parquet_c)
write_parquet(df_c2, "Intmd_data.parquet")


# Read Parquet file into a dataframe
crash_df <- arrow::read_parquet("Intmd_data.parquet")

# Inspect the structure of the dataframe
str(crash_df)
## tibble [804,227 × 36] (S3: tbl_df/tbl/data.frame)
##  $ CRASH_DATE                   : POSIXct[1:804227], format: "2023-07-29 14:45:00" "2023-08-18 17:58:00" ...
##  $ POSTED_SPEED_LIMIT           : int [1:804227] 30 30 10 30 15 30 30 30 30 20 ...
##  $ TRAFFIC_CONTROL_DEVICE       : chr [1:804227] "TRAFFIC SIGNAL" "NO CONTROLS" "NO CONTROLS" "TRAFFIC SIGNAL" ...
##  $ DEVICE_CONDITION             : chr [1:804227] "FUNCTIONING PROPERLY" "NO CONTROLS" "NO CONTROLS" "FUNCTIONING PROPERLY" ...
##  $ WEATHER_CONDITION            : chr [1:804227] "CLEAR" "CLEAR" "UNKNOWN" "CLEAR" ...
##  $ LIGHTING_CONDITION           : chr [1:804227] "DAYLIGHT" "DAYLIGHT" "UNKNOWN" "DARKNESS" ...
##  $ FIRST_CRASH_TYPE             : chr [1:804227] "PARKED MOTOR VEHICLE" "PEDALCYCLIST" "ANGLE" "SIDESWIPE OPPOSITE DIRECTION" ...
##  $ TRAFFICWAY_TYPE              : chr [1:804227] "DIVIDED - W/MEDIAN (NOT RAISED)" "NOT DIVIDED" "PARKING LOT" "NOT DIVIDED" ...
##  $ ALIGNMENT                    : chr [1:804227] "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" ...
##  $ ROADWAY_SURFACE_COND         : chr [1:804227] "DRY" "DRY" "UNKNOWN" "UNKNOWN" ...
##  $ ROAD_DEFECT                  : chr [1:804227] "NO DEFECTS" "NO DEFECTS" "UNKNOWN" "UNKNOWN" ...
##  $ CRASH_TYPE                   : chr [1:804227] "NO INJURY / DRIVE AWAY" "INJURY AND / OR TOW DUE TO CRASH" "NO INJURY / DRIVE AWAY" "NO INJURY / DRIVE AWAY" ...
##  $ INTERSECTION_RELATED_I       : chr [1:804227] "" "" "" "" ...
##  $ HIT_AND_RUN_I                : chr [1:804227] "Y" "" "Y" "Y" ...
##  $ DAMAGE                       : chr [1:804227] "OVER $1,500" "$501 - $1,500" "$501 - $1,500" "OVER $1,500" ...
##  $ PRIM_CONTRIBUTORY_CAUSE      : chr [1:804227] "FAILING TO REDUCE SPEED TO AVOID CRASH" "FAILING TO REDUCE SPEED TO AVOID CRASH" "UNABLE TO DETERMINE" "UNABLE TO DETERMINE" ...
##  $ SEC_CONTRIBUTORY_CAUSE       : chr [1:804227] "OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER" "UNABLE TO DETERMINE" "NOT APPLICABLE" "NOT APPLICABLE" ...
##  $ STREET_NO                    : chr [1:804227] "2101" "3422" "1320" "999" ...
##  $ STREET_DIRECTION             : chr [1:804227] "S" "N" "E" "N" ...
##  $ STREET_NAME                  : chr [1:804227] "ASHLAND AVE" "LONG AVE" "47TH ST" "CALIFORNIA AVE" ...
##  $ BEAT_OF_OCCURRENCE           : int [1:804227] 1235 1633 222 1211 412 731 1712 1811 1214 2525 ...
##  $ NUM_UNITS                    : int [1:804227] 4 2 2 2 2 2 3 2 2 2 ...
##  $ MOST_SEVERE_INJURY           : chr [1:804227] "NO INDICATION OF INJURY" "NONINCAPACITATING INJURY" "NO INDICATION OF INJURY" "NO INDICATION OF INJURY" ...
##  $ INJURIES_TOTAL               : int [1:804227] 0 1 0 0 0 5 0 0 0 0 ...
##  $ INJURIES_FATAL               : int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
##  $ INJURIES_INCAPACITATING      : int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
##  $ INJURIES_NON_INCAPACITATING  : int [1:804227] 0 1 0 0 0 5 0 0 0 0 ...
##  $ INJURIES_REPORTED_NOT_EVIDENT: int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
##  $ INJURIES_NO_INDICATION       : int [1:804227] 1 1 2 2 2 0 4 2 3 1 ...
##  $ CRASH_HOUR                   : int [1:804227] 14 17 14 0 12 19 10 18 14 12 ...
##  $ CRASH_DAY_OF_WEEK            : int [1:804227] 7 6 7 7 4 4 1 4 7 1 ...
##  $ CRASH_MONTH                  : int [1:804227] 7 8 7 7 9 8 8 9 8 7 ...
##  $ LATITUDE                     : num [1:804227] 41.9 41.9 41.8 41.9 41.7 ...
##  $ LONGITUDE                    : num [1:804227] -87.7 -87.8 -87.6 -87.7 -87.6 ...
##  $ crash_year                   : num [1:804227] 2023 2023 2023 2023 2023 ...
##  $ address                      : chr [1:804227] "2101 S ASHLAND AVE" "3422 N LONG AVE" "1320 E 47TH ST" "999 N CALIFORNIA AVE" ...
# Rounding at mid-point to nearest round_unit
round_unit <- 5
crash_df$POSTED_SPEED_LIMIT <- (crash_df$POSTED_SPEED_LIMIT %/% round_unit * round_unit) + round((crash_df$POSTED_SPEED_LIMIT %% round_unit) / round_unit) * round_unit

# Counting the occurrences of each rounded value
table(crash_df$POSTED_SPEED_LIMIT)
## 
##      0      5     10     15     20     25     30     35     40     45     50 
##   7455   4831  18706  28251  32902  50705 593652  53804   7741   5191    218 
##     55     60     65     70    100 
##    642     43     15      5     66
# Create 'has_injuries' column based on injuries_total
crash_df$has_injuries <- as.integer(crash_df$INJURIES_TOTAL > 0)

# Count the occurrences of each unique value in 'has_injuries' column
table(crash_df$has_injuries)
## 
##      0      1 
## 693524 110703
# Create 'has_fatal' column based on injuries_fatal
crash_df$has_fatal <- as.integer(crash_df$INJURIES_FATAL > 0)

# Count the occurrences of each unique value in 'has_fatal' column
table(crash_df$has_fatal)
## 
##      0      1 
## 802361    883
# Create 'crash_time_of_day' column using cut function
crash_df$crash_time_of_day <- cut(crash_df$CRASH_HOUR, breaks = c(-Inf, 6, 12, 18, Inf), labels = c("overnight", "morning", "mid_day", "evening"))

# Count the occurrences of each category in 'crash_time_of_day' column
table(crash_df$crash_time_of_day)
## 
## overnight   morning   mid_day   evening 
##     93200    238054    335923    137050
# Define list of feature names
features_names <- c(
  'crash_date',
  'crash_year',
  'crash_month',
  'crash_day_of_week',
  'crash_hour',
  'crash_time_of_day', # New
  'latitude',
  'longitude',
  'beat_of_occurrence',
  'address',
  'street_no',
  'street_direction',
  'street_name',
  'posted_speed_limit',
  'traffic_control_device',
  'device_condition',
  'weather_condition',
  'lighting_condition',
  'trafficway_type',
  'alignment',
  'roadway_surface_cond',
  'road_defect',
  'first_crash_type',
  'prim_contributory_cause',
  'sec_contributory_cause',
  'num_units'
)

# Define list of target names
target_names <- c(
  'has_injuries',  # New
  'has_fatal',  # New
  'crash_type',
  'damage',
  'injuries_total',
  'injuries_fatal'
)

# Combine feature and target names
all_columns <- c(features_names, target_names)

colnames(crash_df)
##  [1] "CRASH_DATE"                    "POSTED_SPEED_LIMIT"           
##  [3] "TRAFFIC_CONTROL_DEVICE"        "DEVICE_CONDITION"             
##  [5] "WEATHER_CONDITION"             "LIGHTING_CONDITION"           
##  [7] "FIRST_CRASH_TYPE"              "TRAFFICWAY_TYPE"              
##  [9] "ALIGNMENT"                     "ROADWAY_SURFACE_COND"         
## [11] "ROAD_DEFECT"                   "CRASH_TYPE"                   
## [13] "INTERSECTION_RELATED_I"        "HIT_AND_RUN_I"                
## [15] "DAMAGE"                        "PRIM_CONTRIBUTORY_CAUSE"      
## [17] "SEC_CONTRIBUTORY_CAUSE"        "STREET_NO"                    
## [19] "STREET_DIRECTION"              "STREET_NAME"                  
## [21] "BEAT_OF_OCCURRENCE"            "NUM_UNITS"                    
## [23] "MOST_SEVERE_INJURY"            "INJURIES_TOTAL"               
## [25] "INJURIES_FATAL"                "INJURIES_INCAPACITATING"      
## [27] "INJURIES_NON_INCAPACITATING"   "INJURIES_REPORTED_NOT_EVIDENT"
## [29] "INJURIES_NO_INDICATION"        "CRASH_HOUR"                   
## [31] "CRASH_DAY_OF_WEEK"             "CRASH_MONTH"                  
## [33] "LATITUDE"                      "LONGITUDE"                    
## [35] "crash_year"                    "address"                      
## [37] "has_injuries"                  "has_fatal"                    
## [39] "crash_time_of_day"
# Assuming df_c is your dataframe
names(crash_df) <- tolower(names(crash_df))
colnames(crash_df)
##  [1] "crash_date"                    "posted_speed_limit"           
##  [3] "traffic_control_device"        "device_condition"             
##  [5] "weather_condition"             "lighting_condition"           
##  [7] "first_crash_type"              "trafficway_type"              
##  [9] "alignment"                     "roadway_surface_cond"         
## [11] "road_defect"                   "crash_type"                   
## [13] "intersection_related_i"        "hit_and_run_i"                
## [15] "damage"                        "prim_contributory_cause"      
## [17] "sec_contributory_cause"        "street_no"                    
## [19] "street_direction"              "street_name"                  
## [21] "beat_of_occurrence"            "num_units"                    
## [23] "most_severe_injury"            "injuries_total"               
## [25] "injuries_fatal"                "injuries_incapacitating"      
## [27] "injuries_non_incapacitating"   "injuries_reported_not_evident"
## [29] "injuries_no_indication"        "crash_hour"                   
## [31] "crash_day_of_week"             "crash_month"                  
## [33] "latitude"                      "longitude"                    
## [35] "crash_year"                    "address"                      
## [37] "has_injuries"                  "has_fatal"                    
## [39] "crash_time_of_day"
file_crash_df_parquet <- "crash_df.parquet"

# Write dataframe to Parquet format
write_parquet(crash_df[all_columns], file_crash_df_parquet)

# Replace crash_df with your data frame variable name and all_columns with the columns you want to include
write_parquet(crash_df[, all_columns], "file_crash_df.parquet")

# Replace file_crash_df_parquet with the path to your Parquet file
crash_df <- read_parquet("file_crash_df.parquet")

# Replace file_crash_df_parquet with the path to your Parquet file
crash_df <- read_parquet("file_crash_df.parquet")

# Replace crash_df with your data frame variable name and target_names with the names of the columns you want to select
crash_targets <- crash_df %>%
  select(target_names)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(target_names)
## 
##   # Now:
##   data %>% select(all_of(target_names))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Print information about the selected columns
str(crash_targets)
## tibble [804,227 × 6] (S3: tbl_df/tbl/data.frame)
##  $ has_injuries  : int [1:804227] 0 1 0 0 0 1 0 0 0 0 ...
##  $ has_fatal     : int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
##  $ crash_type    : chr [1:804227] "NO INJURY / DRIVE AWAY" "INJURY AND / OR TOW DUE TO CRASH" "NO INJURY / DRIVE AWAY" "NO INJURY / DRIVE AWAY" ...
##  $ damage        : chr [1:804227] "OVER $1,500" "$501 - $1,500" "$501 - $1,500" "OVER $1,500" ...
##  $ injuries_total: int [1:804227] 0 1 0 0 0 5 0 0 0 0 ...
##  $ injuries_fatal: int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
str(crash_df)
## tibble [804,227 × 32] (S3: tbl_df/tbl/data.frame)
##  $ crash_date             : POSIXct[1:804227], format: "2023-07-29 14:45:00" "2023-08-18 17:58:00" ...
##  $ crash_year             : num [1:804227] 2023 2023 2023 2023 2023 ...
##  $ crash_month            : int [1:804227] 7 8 7 7 9 8 8 9 8 7 ...
##  $ crash_day_of_week      : int [1:804227] 7 6 7 7 4 4 1 4 7 1 ...
##  $ crash_hour             : int [1:804227] 14 17 14 0 12 19 10 18 14 12 ...
##  $ crash_time_of_day      : Factor w/ 4 levels "overnight","morning",..: 3 3 3 1 2 4 2 3 3 2 ...
##  $ latitude               : num [1:804227] 41.9 41.9 41.8 41.9 41.7 ...
##  $ longitude              : num [1:804227] -87.7 -87.8 -87.6 -87.7 -87.6 ...
##  $ beat_of_occurrence     : int [1:804227] 1235 1633 222 1211 412 731 1712 1811 1214 2525 ...
##  $ address                : chr [1:804227] "2101 S ASHLAND AVE" "3422 N LONG AVE" "1320 E 47TH ST" "999 N CALIFORNIA AVE" ...
##  $ street_no              : chr [1:804227] "2101" "3422" "1320" "999" ...
##  $ street_direction       : chr [1:804227] "S" "N" "E" "N" ...
##  $ street_name            : chr [1:804227] "ASHLAND AVE" "LONG AVE" "47TH ST" "CALIFORNIA AVE" ...
##  $ posted_speed_limit     : num [1:804227] 30 30 10 30 15 30 30 30 30 20 ...
##  $ traffic_control_device : chr [1:804227] "TRAFFIC SIGNAL" "NO CONTROLS" "NO CONTROLS" "TRAFFIC SIGNAL" ...
##  $ device_condition       : chr [1:804227] "FUNCTIONING PROPERLY" "NO CONTROLS" "NO CONTROLS" "FUNCTIONING PROPERLY" ...
##  $ weather_condition      : chr [1:804227] "CLEAR" "CLEAR" "UNKNOWN" "CLEAR" ...
##  $ lighting_condition     : chr [1:804227] "DAYLIGHT" "DAYLIGHT" "UNKNOWN" "DARKNESS" ...
##  $ trafficway_type        : chr [1:804227] "DIVIDED - W/MEDIAN (NOT RAISED)" "NOT DIVIDED" "PARKING LOT" "NOT DIVIDED" ...
##  $ alignment              : chr [1:804227] "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" ...
##  $ roadway_surface_cond   : chr [1:804227] "DRY" "DRY" "UNKNOWN" "UNKNOWN" ...
##  $ road_defect            : chr [1:804227] "NO DEFECTS" "NO DEFECTS" "UNKNOWN" "UNKNOWN" ...
##  $ first_crash_type       : chr [1:804227] "PARKED MOTOR VEHICLE" "PEDALCYCLIST" "ANGLE" "SIDESWIPE OPPOSITE DIRECTION" ...
##  $ prim_contributory_cause: chr [1:804227] "FAILING TO REDUCE SPEED TO AVOID CRASH" "FAILING TO REDUCE SPEED TO AVOID CRASH" "UNABLE TO DETERMINE" "UNABLE TO DETERMINE" ...
##  $ sec_contributory_cause : chr [1:804227] "OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER" "UNABLE TO DETERMINE" "NOT APPLICABLE" "NOT APPLICABLE" ...
##  $ num_units              : int [1:804227] 4 2 2 2 2 2 3 2 2 2 ...
##  $ has_injuries           : int [1:804227] 0 1 0 0 0 1 0 0 0 0 ...
##  $ has_fatal              : int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
##  $ crash_type             : chr [1:804227] "NO INJURY / DRIVE AWAY" "INJURY AND / OR TOW DUE TO CRASH" "NO INJURY / DRIVE AWAY" "NO INJURY / DRIVE AWAY" ...
##  $ damage                 : chr [1:804227] "OVER $1,500" "$501 - $1,500" "$501 - $1,500" "OVER $1,500" ...
##  $ injuries_total         : int [1:804227] 0 1 0 0 0 5 0 0 0 0 ...
##  $ injuries_fatal         : int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
# File path to the Parquet file
file_crash_df_parquet <- "crash_df.parquet"

# Read the Parquet file into a dataframe
crash_df <- arrow::read_parquet(file_crash_df_parquet)

# Print information about the dataframe
str(crash_df)
## tibble [804,227 × 32] (S3: tbl_df/tbl/data.frame)
##  $ crash_date             : POSIXct[1:804227], format: "2023-07-29 14:45:00" "2023-08-18 17:58:00" ...
##  $ crash_year             : num [1:804227] 2023 2023 2023 2023 2023 ...
##  $ crash_month            : int [1:804227] 7 8 7 7 9 8 8 9 8 7 ...
##  $ crash_day_of_week      : int [1:804227] 7 6 7 7 4 4 1 4 7 1 ...
##  $ crash_hour             : int [1:804227] 14 17 14 0 12 19 10 18 14 12 ...
##  $ crash_time_of_day      : Factor w/ 4 levels "overnight","morning",..: 3 3 3 1 2 4 2 3 3 2 ...
##  $ latitude               : num [1:804227] 41.9 41.9 41.8 41.9 41.7 ...
##  $ longitude              : num [1:804227] -87.7 -87.8 -87.6 -87.7 -87.6 ...
##  $ beat_of_occurrence     : int [1:804227] 1235 1633 222 1211 412 731 1712 1811 1214 2525 ...
##  $ address                : chr [1:804227] "2101 S ASHLAND AVE" "3422 N LONG AVE" "1320 E 47TH ST" "999 N CALIFORNIA AVE" ...
##  $ street_no              : chr [1:804227] "2101" "3422" "1320" "999" ...
##  $ street_direction       : chr [1:804227] "S" "N" "E" "N" ...
##  $ street_name            : chr [1:804227] "ASHLAND AVE" "LONG AVE" "47TH ST" "CALIFORNIA AVE" ...
##  $ posted_speed_limit     : num [1:804227] 30 30 10 30 15 30 30 30 30 20 ...
##  $ traffic_control_device : chr [1:804227] "TRAFFIC SIGNAL" "NO CONTROLS" "NO CONTROLS" "TRAFFIC SIGNAL" ...
##  $ device_condition       : chr [1:804227] "FUNCTIONING PROPERLY" "NO CONTROLS" "NO CONTROLS" "FUNCTIONING PROPERLY" ...
##  $ weather_condition      : chr [1:804227] "CLEAR" "CLEAR" "UNKNOWN" "CLEAR" ...
##  $ lighting_condition     : chr [1:804227] "DAYLIGHT" "DAYLIGHT" "UNKNOWN" "DARKNESS" ...
##  $ trafficway_type        : chr [1:804227] "DIVIDED - W/MEDIAN (NOT RAISED)" "NOT DIVIDED" "PARKING LOT" "NOT DIVIDED" ...
##  $ alignment              : chr [1:804227] "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" ...
##  $ roadway_surface_cond   : chr [1:804227] "DRY" "DRY" "UNKNOWN" "UNKNOWN" ...
##  $ road_defect            : chr [1:804227] "NO DEFECTS" "NO DEFECTS" "UNKNOWN" "UNKNOWN" ...
##  $ first_crash_type       : chr [1:804227] "PARKED MOTOR VEHICLE" "PEDALCYCLIST" "ANGLE" "SIDESWIPE OPPOSITE DIRECTION" ...
##  $ prim_contributory_cause: chr [1:804227] "FAILING TO REDUCE SPEED TO AVOID CRASH" "FAILING TO REDUCE SPEED TO AVOID CRASH" "UNABLE TO DETERMINE" "UNABLE TO DETERMINE" ...
##  $ sec_contributory_cause : chr [1:804227] "OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER" "UNABLE TO DETERMINE" "NOT APPLICABLE" "NOT APPLICABLE" ...
##  $ num_units              : int [1:804227] 4 2 2 2 2 2 3 2 2 2 ...
##  $ has_injuries           : int [1:804227] 0 1 0 0 0 1 0 0 0 0 ...
##  $ has_fatal              : int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
##  $ crash_type             : chr [1:804227] "NO INJURY / DRIVE AWAY" "INJURY AND / OR TOW DUE TO CRASH" "NO INJURY / DRIVE AWAY" "NO INJURY / DRIVE AWAY" ...
##  $ damage                 : chr [1:804227] "OVER $1,500" "$501 - $1,500" "$501 - $1,500" "OVER $1,500" ...
##  $ injuries_total         : int [1:804227] 0 1 0 0 0 5 0 0 0 0 ...
##  $ injuries_fatal         : int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
# Subset the data where crash_year > 2017 and crash_year < 2021
subset_df <- subset(crash_df, crash_year > 2017 & crash_year < 2021)

# Print the subsetted dataframe
head(subset_df)
## # A tibble: 6 × 32
##   crash_date          crash_year crash_month crash_day_of_week crash_hour
##   <dttm>                   <dbl>       <int>             <int>      <int>
## 1 2019-08-18 19:27:00       2019           8                 1         19
## 2 2018-07-27 10:18:00       2018           7                 6         10
## 3 2020-05-22 04:00:00       2020           5                 6          4
## 4 2019-06-11 08:40:00       2019           6                 3          8
## 5 2020-01-07 05:45:00       2020           1                 3          5
## 6 2019-03-17 01:53:00       2019           3                 1          1
## # ℹ 27 more variables: crash_time_of_day <fct>, latitude <dbl>,
## #   longitude <dbl>, beat_of_occurrence <int>, address <chr>, street_no <chr>,
## #   street_direction <chr>, street_name <chr>, posted_speed_limit <dbl>,
## #   traffic_control_device <chr>, device_condition <chr>,
## #   weather_condition <chr>, lighting_condition <chr>, trafficway_type <chr>,
## #   alignment <chr>, roadway_surface_cond <chr>, road_defect <chr>,
## #   first_crash_type <chr>, prim_contributory_cause <chr>, …
# Filter rows based on the condition (crash_year > 2017 & crash_year < 2021)
crash_df <- crash_df %>%
  filter(crash_year > 2017 & crash_year < 2021)

# Retrieve column names of the dataframe crash_df
column_names <- colnames(crash_df)
column_names
##  [1] "crash_date"              "crash_year"             
##  [3] "crash_month"             "crash_day_of_week"      
##  [5] "crash_hour"              "crash_time_of_day"      
##  [7] "latitude"                "longitude"              
##  [9] "beat_of_occurrence"      "address"                
## [11] "street_no"               "street_direction"       
## [13] "street_name"             "posted_speed_limit"     
## [15] "traffic_control_device"  "device_condition"       
## [17] "weather_condition"       "lighting_condition"     
## [19] "trafficway_type"         "alignment"              
## [21] "roadway_surface_cond"    "road_defect"            
## [23] "first_crash_type"        "prim_contributory_cause"
## [25] "sec_contributory_cause"  "num_units"              
## [27] "has_injuries"            "has_fatal"              
## [29] "crash_type"              "damage"                 
## [31] "injuries_total"          "injuries_fatal"
nrow(crash_df)
## [1] 326469
table(crash_df$crash_year)
## 
##   2018   2019   2020 
## 118377 116736  91356
features_names <- c(
  'crash_date',
  'crash_year',
  'crash_month',
  'crash_day_of_week',
  'crash_hour',
  'crash_time_of_day', # New
  'latitude',
  'longitude',
  'beat_of_occurrence',
  'address',
  #    'street_no',
  #    'street_direction',
  #    'street_name',
  'posted_speed_limit',
  'traffic_control_device',
  'device_condition',
  'weather_condition',
  'lighting_condition',
  'trafficway_type',
  'alignment',
  'roadway_surface_cond',
  'road_defect',
  'first_crash_type',
  'prim_contributory_cause',
  'sec_contributory_cause',
  'num_units'
)

target_names <- c(
  'has_injuries',  # New
  'has_fatal',  # New
  'crash_type',
  'damage',
  'injuries_total',
  'injuries_fatal'
  #    'injuries_incapacitating',
  #    'inj_non_incap',
  #    'inj_report_not_evdnt',
  #    'injuries_no_indication',
  #    'most_severe_injury'
)

all_columns <- c(features_names, target_names)
all_columns
##  [1] "crash_date"              "crash_year"             
##  [3] "crash_month"             "crash_day_of_week"      
##  [5] "crash_hour"              "crash_time_of_day"      
##  [7] "latitude"                "longitude"              
##  [9] "beat_of_occurrence"      "address"                
## [11] "posted_speed_limit"      "traffic_control_device" 
## [13] "device_condition"        "weather_condition"      
## [15] "lighting_condition"      "trafficway_type"        
## [17] "alignment"               "roadway_surface_cond"   
## [19] "road_defect"             "first_crash_type"       
## [21] "prim_contributory_cause" "sec_contributory_cause" 
## [23] "num_units"               "has_injuries"           
## [25] "has_fatal"               "crash_type"             
## [27] "damage"                  "injuries_total"         
## [29] "injuries_fatal"
table(crash_df$posted_speed_limit)
## 
##      0      5     10     15     20     25     30     35     40     45     50 
##   3225   2313   7134  11672  12902  20291 240562  23065   3102   1870     72 
##     55     60     65     70 
##    241     12      5      3
table(round(crash_df$posted_speed_limit, -1))
## 
##      0     10     20     30     40     50     60     70 
##   5538   7134  44865 240562  28037     72    258      3
crash_df %>%
  mutate(rounded_speed_limit = (posted_speed_limit %/% 10 * 10) + round((posted_speed_limit %% 10) / 10) * 10) %>%
  count(rounded_speed_limit)
## # A tibble: 8 × 2
##   rounded_speed_limit      n
##                 <dbl>  <int>
## 1                   0   5538
## 2                  10  18806
## 3                  20  33193
## 4                  30 263627
## 5                  40   4972
## 6                  50    313
## 7                  60     17
## 8                  70      3
crash_df %>%
  mutate(rounded_speed_limit = (posted_speed_limit %/% 5 * 5) + round((posted_speed_limit %% 5) / 5) * 5) %>%
  count(rounded_speed_limit)
## # A tibble: 15 × 2
##    rounded_speed_limit      n
##                  <dbl>  <int>
##  1                   0   3225
##  2                   5   2313
##  3                  10   7134
##  4                  15  11672
##  5                  20  12902
##  6                  25  20291
##  7                  30 240562
##  8                  35  23065
##  9                  40   3102
## 10                  45   1870
## 11                  50     72
## 12                  55    241
## 13                  60     12
## 14                  65      5
## 15                  70      3
rounding_function <- function(x) {
  (floor(x / 10) * 10) + round((x %% 10) / 10) * 10
}

result <- rounding_function(6)
result
## [1] 10
crash_df %>%
  pull(beat_of_occurrence) %>%
  n_distinct()
## [1] 271
crash_df %>%
  pull(address) %>%
  n_distinct()
## [1] 170695
crash_df %>%
  group_by(latitude, longitude) %>%
  summarise(n = n())
## `summarise()` has grouped output by 'latitude'. You can override using the
## `.groups` argument.
## # A tibble: 162,486 × 3
## # Groups:   latitude [162,433]
##    latitude longitude     n
##       <dbl>     <dbl> <int>
##  1     41.6     -87.5     6
##  2     41.6     -87.6     1
##  3     41.6     -87.6     4
##  4     41.6     -87.6     2
##  5     41.6     -87.6     1
##  6     41.6     -87.6     1
##  7     41.6     -87.6     1
##  8     41.6     -87.6     1
##  9     41.6     -87.6     1
## 10     41.6     -87.6     1
## # ℹ 162,476 more rows
table(crash_df$crash_type)
## 
## INJURY AND / OR TOW DUE TO CRASH           NO INJURY / DRIVE AWAY 
##                            91008                           235461
table(as.integer(crash_df$injuries_total > 0))
## 
##      0      1 
## 279680  46789
table(as.integer(crash_df$injuries_fatal > 0))
## 
##      0      1 
## 325701    347
table(crash_df$first_crash_type)
## 
##                        ANGLE                       ANIMAL 
##                        34943                          242 
##                 FIXED OBJECT                      HEAD ON 
##                        16280                         2739 
##           OTHER NONCOLLISION                 OTHER OBJECT 
##                         1121                         3356 
##                   OVERTURNED         PARKED MOTOR VEHICLE 
##                          185                        75272 
##                 PEDALCYCLIST                   PEDESTRIAN 
##                         5158                         8530 
##                     REAR END                REAR TO FRONT 
##                        74338                         2470 
##                 REAR TO REAR                 REAR TO SIDE 
##                          560                         1539 
## SIDESWIPE OPPOSITE DIRECTION     SIDESWIPE SAME DIRECTION 
##                         4701                        48197 
##                        TRAIN                      TURNING 
##                           23                        46815
table(crash_df$trafficway_type)
## 
##                           ALLEY                CENTER TURN LANE 
##                            5506                            2897 
## DIVIDED - W/MEDIAN (NOT RAISED)      DIVIDED - W/MEDIAN BARRIER 
##                           55313                           19167 
##                        DRIVEWAY             FIVE POINT, OR MORE 
##                            1131                             351 
##                        FOUR WAY                  L-INTERSECTION 
##                           13794                              47 
##                     NOT DIVIDED                    NOT REPORTED 
##                          146295                             109 
##                         ONE-WAY                           OTHER 
##                           42347                            8752 
##                     PARKING LOT                            RAMP 
##                           22138                            1029 
##                      ROUNDABOUT                  T-INTERSECTION 
##                              85                            2896 
##                   TRAFFIC ROUTE                         UNKNOWN 
##                             259                            3127 
##       UNKNOWN INTERSECTION TYPE                  Y-INTERSECTION 
##                             872                             354
table(crash_df$prim_contributory_cause)
## 
##                                                                           ANIMAL 
##                                                                              301 
##                                           BICYCLE ADVANCING LEGALLY ON RED LIGHT 
##                                                                               42 
##                                                CELL PHONE USE OTHER THAN TEXTING 
##                                                                              471 
##                                                 DISREGARDING OTHER TRAFFIC SIGNS 
##                                                                              753 
##                                                       DISREGARDING ROAD MARKINGS 
##                                                                              459 
##                                                           DISREGARDING STOP SIGN 
##                                                                             3846 
##                                                     DISREGARDING TRAFFIC SIGNALS 
##                                                                             6600 
##                                                          DISREGARDING YIELD SIGN 
##                                                                              135 
##                                                DISTRACTION - FROM INSIDE VEHICLE 
##                                                                             2427 
##                                               DISTRACTION - FROM OUTSIDE VEHICLE 
##                                                                             1527 
##      DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.) 
##                                                                              175 
##                                                  DRIVING ON WRONG SIDE/WRONG WAY 
##                                                                             1658 
##                                              DRIVING SKILLS/KNOWLEDGE/EXPERIENCE 
##                                                                             9904 
##                                                    EQUIPMENT - VEHICLE CONDITION 
##                                                                             2242 
##                                EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST 
##                                                                              613 
##                                                 EXCEEDING AUTHORIZED SPEED LIMIT 
##                                                                             1025 
##                                              EXCEEDING SAFE SPEED FOR CONDITIONS 
##                                                                              981 
##                                           FAILING TO REDUCE SPEED TO AVOID CRASH 
##                                                                            15363 
##                                                    FAILING TO YIELD RIGHT-OF-WAY 
##                                                                            36513 
##                                                            FOLLOWING TOO CLOSELY 
##                                                                            32819 
##                                  HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE) 
##                                                                              368 
##                                                                 IMPROPER BACKING 
##                                                                            13817 
##                                                              IMPROPER LANE USAGE 
##                                                                            12358 
##                                                      IMPROPER OVERTAKING/PASSING 
##                                                                            15271 
##                                                       IMPROPER TURNING/NO SIGNAL 
##                                                                            11089 
##                                        MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT 
##                                                                               15 
##                                                                   NOT APPLICABLE 
##                                                                            17708 
##                                                            OBSTRUCTED CROSSWALKS 
##                                                                               28 
## OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER 
##                                                                             4363 
##                                                       PASSING STOPPED SCHOOL BUS 
##                                                                               47 
##                                                     PHYSICAL CONDITION OF DRIVER 
##                                                                             2105 
##                                                              RELATED TO BUS STOP 
##                                                                              151 
##                                                    ROAD CONSTRUCTION/MAINTENANCE 
##                                                                              803 
##                                         ROAD ENGINEERING/SURFACE/MARKING DEFECTS 
##                                                                              967 
##                                                                          TEXTING 
##                                                                              141 
##                                                             TURNING RIGHT ON RED 
##                                                                              230 
##                                                              UNABLE TO DETERMINE 
##                                                                           119789 
##               UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED) 
##                                                                             2006 
##                             VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.) 
##                                                                             2044 
##                                                                          WEATHER 
##                                                                             5315
table(crash_df$sec_contributory_cause)
## 
##                                                                           ANIMAL 
##                                                                              156 
##                                           BICYCLE ADVANCING LEGALLY ON RED LIGHT 
##                                                                              119 
##                                                CELL PHONE USE OTHER THAN TEXTING 
##                                                                              264 
##                                                 DISREGARDING OTHER TRAFFIC SIGNS 
##                                                                              355 
##                                                       DISREGARDING ROAD MARKINGS 
##                                                                              355 
##                                                           DISREGARDING STOP SIGN 
##                                                                              930 
##                                                     DISREGARDING TRAFFIC SIGNALS 
##                                                                             1290 
##                                                          DISREGARDING YIELD SIGN 
##                                                                               91 
##                                                DISTRACTION - FROM INSIDE VEHICLE 
##                                                                             1036 
##                                               DISTRACTION - FROM OUTSIDE VEHICLE 
##                                                                              602 
##      DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.) 
##                                                                              109 
##                                                  DRIVING ON WRONG SIDE/WRONG WAY 
##                                                                              629 
##                                              DRIVING SKILLS/KNOWLEDGE/EXPERIENCE 
##                                                                             9925 
##                                                    EQUIPMENT - VEHICLE CONDITION 
##                                                                              718 
##                                EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST 
##                                                                              173 
##                                                 EXCEEDING AUTHORIZED SPEED LIMIT 
##                                                                              787 
##                                              EXCEEDING SAFE SPEED FOR CONDITIONS 
##                                                                              790 
##                                           FAILING TO REDUCE SPEED TO AVOID CRASH 
##                                                                            13768 
##                                                    FAILING TO YIELD RIGHT-OF-WAY 
##                                                                             9389 
##                                                            FOLLOWING TOO CLOSELY 
##                                                                             8205 
##                                  HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE) 
##                                                                              405 
##                                                                 IMPROPER BACKING 
##                                                                             2553 
##                                                              IMPROPER LANE USAGE 
##                                                                             4472 
##                                                      IMPROPER OVERTAKING/PASSING 
##                                                                             4690 
##                                                       IMPROPER TURNING/NO SIGNAL 
##                                                                             3237 
##                                        MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT 
##                                                                               31 
##                                                                   NOT APPLICABLE 
##                                                                           137441 
##                                                            OBSTRUCTED CROSSWALKS 
##                                                                               45 
## OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER 
##                                                                             2261 
##                                                       PASSING STOPPED SCHOOL BUS 
##                                                                               46 
##                                                     PHYSICAL CONDITION OF DRIVER 
##                                                                             1067 
##                                                              RELATED TO BUS STOP 
##                                                                              175 
##                                                    ROAD CONSTRUCTION/MAINTENANCE 
##                                                                              427 
##                                         ROAD ENGINEERING/SURFACE/MARKING DEFECTS 
##                                                                              360 
##                                                                          TEXTING 
##                                                                               63 
##                                                             TURNING RIGHT ON RED 
##                                                                              120 
##                                                              UNABLE TO DETERMINE 
##                                                                           113655 
##               UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED) 
##                                                                              621 
##                             VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.) 
##                                                                             1166 
##                                                                          WEATHER 
##                                                                             3943
table(crash_df$traffic_control_device)
## 
##    BICYCLE CROSSING SIGN              DELINEATORS  FLASHING CONTROL SIGNAL 
##                       14                       98                      111 
##         LANE USE MARKING              NO CONTROLS               NO PASSING 
##                      752                   186811                       14 
##                    OTHER  OTHER RAILROAD CROSSING          OTHER REG. SIGN 
##                     2039                       71                      374 
##       OTHER WARNING SIGN PEDESTRIAN CROSSING SIGN           POLICE/FLAGMAN 
##                      316                      195                      126 
##   RAILROAD CROSSING GATE         RR CROSSING SIGN              SCHOOL ZONE 
##                      195                       38                      111 
##        STOP SIGN/FLASHER           TRAFFIC SIGNAL                  UNKNOWN 
##                    33451                    90908                    10379 
##                    YIELD 
##                      466
summary(crash_df$injuries_total)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.197   0.000  21.000
# Week day mapping
week_days <- c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")

# Weekday/Weekend mapping
is_weekday <- c("Weekend", "Weekday")

# Assuming crash_df is your dataframe
par(mfrow=c(3, 1))  # Set the layout to 3 rows and 1 column
for(year in unique(crash_df$crash_year)) {
  hist(subset(crash_df, crash_year == year)$crash_hour, 
       breaks = 24,
       main = paste("Crashes by Hour - Year", year),
       xlab = "Hour",
       ylab = "Frequency",
       col = "lightblue",
       xlim = c(0, 24),
       ylim = c(0, max(table(crash_df$crash_hour))),
       axes = FALSE)
  axis(1, at=seq(0, 24, by=2))  # Add x-axis labels every 2 hours
  axis(2)  # Add y-axis
}

# Filter dataframe for crashes with injuries
crashes_with_injuries <- subset(crash_df, has_injuries == 1)

# Plot histograms by crash_year
ggplot(crashes_with_injuries, aes(x = crash_hour)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ crash_year, nrow = 3) +
  theme_minimal() +
  labs(x = "Crash Hour", y = "Count") +
  theme(legend.position = "none")

z <- cut(crash_df$crash_hour, breaks = c(-Inf, 6, 12, 18, Inf), labels = c('overnight', 'morning', 'mid_day', 'evening'))

head(z)
## [1] evening   morning   overnight morning   overnight overnight
## Levels: overnight morning mid_day evening
table(z)
## z
## overnight   morning   mid_day   evening 
##     37765     97575    135325     55804
plot(z)

sum(is.na(crash_df$crash_hour))
## [1] 0
sum(is.na(z))
## [1] 0
summary(crash_df$crash_hour)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     9.0    14.0    13.2    17.0    23.0
crash_agg <- crash_df %>%
  group_by(crash_year, crash_month, crash_hour) %>%
  summarise(injuries_total = sum(injuries_total, na.rm = TRUE),
            injuries_fatal = sum(injuries_fatal, na.rm = TRUE)) %>%
  mutate(across(c(injuries_total, injuries_fatal), ~ifelse(is.na(.), 0, .))) %>%
  ungroup()
## `summarise()` has grouped output by 'crash_year', 'crash_month'. You can
## override using the `.groups` argument.
crash_pivot <- crash_agg %>%
  pivot_wider(names_from = crash_year, 
              values_from = c(injuries_total, injuries_fatal),
              values_fn = sum,
              names_sep = "_")

# Aggregate data
crash_agg <- crash_df %>%
  group_by(crash_year, crash_month, crash_time_of_day) %>%
  summarise(has_injuries = sum(has_injuries),
            has_fatal = sum(has_fatal),
            injuries_total = sum(injuries_total),
            injuries_fatal = sum(injuries_fatal)) %>%
  mutate_at(vars(has_injuries, has_fatal, injuries_total, injuries_fatal), ~ ifelse(is.na(.), 0, .))
## `summarise()` has grouped output by 'crash_year', 'crash_month'. You can
## override using the `.groups` argument.
# Plotting
ggplot(crash_agg, aes(x = crash_month, y = has_injuries, color = crash_time_of_day)) +
  geom_line() +
  facet_wrap(~ crash_year, ncol = 3) +
  labs(title = "Chicago Crashes - Time of Day Trends", y = "Number of Injuries") +
  theme_bw() +
  scale_x_continuous(breaks = 1:12)

# Aggregate data
crash_agg <- crash_df %>%
  group_by(crash_year, crash_month, crash_day_of_week) %>%
  summarise(has_injuries = sum(has_injuries),
            injuries_total = sum(injuries_total),
            injuries_fatal = sum(injuries_fatal)) %>%
  mutate_at(vars(has_injuries, injuries_total, injuries_fatal), ~ ifelse(is.na(.), 0, .))
## `summarise()` has grouped output by 'crash_year', 'crash_month'. You can
## override using the `.groups` argument.
# Plotting
ggplot(crash_agg, aes(x = crash_month, y = has_injuries, fill = factor(crash_day_of_week))) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ crash_year, ncol = 3) +
  theme_minimal() +
  labs(x = "Month", y = "Total Injuries", title = "Total Injuries by Month and Day of Week") +
  scale_fill_brewer(palette = "Set1") +
  guides(fill = guide_legend(title = "Day of Week")) +
  scale_x_continuous(breaks = 1:12)

sum(crash_df$weather_condition == 'OTHER', na.rm = TRUE)
## [1] 1072
# Grouping and aggregating
crash_agg <- crash_df %>%
  group_by(crash_year, crash_month, lighting_condition) %>%
  summarise(has_injuries = sum(has_injuries, na.rm = TRUE),
            injuries_total = sum(injuries_total, na.rm = TRUE),
            injuries_fatal = sum(injuries_fatal, na.rm = TRUE)) %>%
  mutate_at(vars(has_injuries, injuries_total, injuries_fatal), ~ ifelse(is.na(.), 0, .)) %>%
  ungroup() %>%
  filter(lighting_condition != "UNKNOWN")
## `summarise()` has grouped output by 'crash_year', 'crash_month'. You can
## override using the `.groups` argument.
# Creating FacetGrid plot
ggplot(crash_agg, aes(x = crash_month, y = has_injuries, col = lighting_condition)) +
  geom_line() +
  facet_wrap(~ crash_year, ncol = 3) +
  labs(title = "Chicago Crashes - Lighting Condition Trends", y = "Has Injuries") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))+
  scale_x_continuous(breaks = 1:12)

crash_agg <- crash_df %>%
  group_by(crash_year, crash_month, weather_condition) %>%
  summarise(has_injuries = sum(has_injuries),
            injuries_total = sum(injuries_total),
            injuries_fatal = sum(injuries_fatal)) %>%
  mutate(across(c(has_injuries, injuries_total, injuries_fatal), ~ifelse(is.na(.), 0, .))) %>%
  ungroup()
## `summarise()` has grouped output by 'crash_year', 'crash_month'. You can
## override using the `.groups` argument.
ggplot(crash_agg, aes(x = crash_month, y = has_injuries, color = weather_condition)) +
  geom_line() +
  facet_wrap(~crash_year, ncol = 3) +
  labs(title = "Chicago Crashes - Weather Condition Trends", y = "Total Injuries") +
  theme(plot.title = element_text(hjust = 0.5))+
  scale_x_continuous(breaks = 1:12)

# Grouping and aggregating
crash_agg <- crash_df %>%
  group_by(crash_year, crash_month, weather_condition) %>%
  summarise(has_injuries = sum(has_injuries),
            injuries_total = sum(injuries_total),
            injuries_fatal = sum(injuries_fatal)) %>%
  mutate_at(vars(has_injuries, injuries_total, injuries_fatal), ~ ifelse(is.na(.), 0, .)) %>%
  filter(!weather_condition %in% c("CLEAR")) %>%
  ungroup()
## `summarise()` has grouped output by 'crash_year', 'crash_month'. You can
## override using the `.groups` argument.
# Creating facet grid
ggplot(crash_agg, aes(x = crash_month, y = has_injuries, color = weather_condition)) +
  geom_line() +
  facet_wrap(~crash_year, ncol = 3) +
  labs(title = "Chicago Crashes - Weather Condition Trends (Excluding CLEAR)", y = "Has Injuries") +
  theme(plot.title = element_text(face = "bold"))+
  scale_x_continuous(breaks = 1:12)

# Create a copy of the dataframe
crash_sdf <- crash_df

# Select columns to be standardized
col_names <- c('injuries_total', 'injuries_fatal')
features <- crash_sdf[col_names]

# Standardize the selected columns
scaled_features <- scale(features)

# Update the original dataframe with standardized values
crash_sdf[col_names] <- scaled_features

# Display the first 5 rows of the updated dataframe
head(crash_sdf, 5)
## # A tibble: 5 × 32
##   crash_date          crash_year crash_month crash_day_of_week crash_hour
##   <dttm>                   <dbl>       <int>             <int>      <int>
## 1 2019-08-18 19:27:00       2019           8                 1         19
## 2 2018-07-27 10:18:00       2018           7                 6         10
## 3 2020-05-22 04:00:00       2020           5                 6          4
## 4 2019-06-11 08:40:00       2019           6                 3          8
## 5 2020-01-07 05:45:00       2020           1                 3          5
## # ℹ 27 more variables: crash_time_of_day <fct>, latitude <dbl>,
## #   longitude <dbl>, beat_of_occurrence <int>, address <chr>, street_no <chr>,
## #   street_direction <chr>, street_name <chr>, posted_speed_limit <dbl>,
## #   traffic_control_device <chr>, device_condition <chr>,
## #   weather_condition <chr>, lighting_condition <chr>, trafficway_type <chr>,
## #   alignment <chr>, roadway_surface_cond <chr>, road_defect <chr>,
## #   first_crash_type <chr>, prim_contributory_cause <chr>, …
# Plot Map
crash_agg <- crash_df %>%
  group_by(longitude, latitude) %>%
  summarise(crashes = n()) %>%
  ungroup()
## `summarise()` has grouped output by 'longitude'. You can override using the
## `.groups` argument.
head(crash_agg)
## # A tibble: 6 × 3
##   longitude latitude crashes
##       <dbl>    <dbl>   <int>
## 1     -87.9     42.0       1
## 2     -87.9     42.0       1
## 3     -87.9     42.0      23
## 4     -87.9     42.0       1
## 5     -87.9     42.0       4
## 6     -87.9     42.0       5
#  Create a copy of crash_df
crash_df_ <- crash_df

# Group by 'longitude', 'latitude', and 'crash_year', then summarize
crash_df_ <- crash_df_ %>%
  group_by(longitude, latitude, crash_year) %>%
  summarize(
    crashes = n(),
    has_injuries = max(has_injuries),
    is_weekday = max(is_weekday)  # is_weekday is a binary indicator
  ) %>%
  ungroup() %>%
  filter(crashes > 0)
## `summarise()` has grouped output by 'longitude', 'latitude'. You can override
## using the `.groups` argument.
head(crash_df_)
## # A tibble: 6 × 6
##   longitude latitude crash_year crashes has_injuries is_weekday
##       <dbl>    <dbl>      <dbl>   <int>        <int> <chr>     
## 1     -87.9     42.0       2020       1            0 Weekend   
## 2     -87.9     42.0       2019       1            0 Weekend   
## 3     -87.9     42.0       2019       8            1 Weekend   
## 4     -87.9     42.0       2020      15            1 Weekend   
## 5     -87.9     42.0       2019       1            1 Weekend   
## 6     -87.9     42.0       2019       2            0 Weekend
# Filter data for crash_year == 2018
crash_df_2018 <- subset(crash_df_, crash_year == 2018)

# Create scatter plot - 2018
ggplot(crash_df_2018, aes(x = longitude, y = latitude, size = crashes, color = has_injuries)) +
  geom_point(alpha = 0.5) +                # Add points with transparency
  scale_size_continuous(range = c(0, 5)) + # Adjust the size range
  labs(x = NULL, y = NULL, title = "Crashes in Chicago 2018") +  # Remove axis labels and set title
  theme_minimal() +  # Minimal theme
  theme(axis.text = element_blank(),  # Remove axis text
        axis.title = element_blank(),  # Remove axis title
        plot.title = element_text(hjust = 0.5))  # Center plot title

# Filter data for crash_year == 2019
crash_df_2019 <- subset(crash_df_, crash_year == 2019)

# Create scatter plot - 2019
ggplot(crash_df_2019, aes(x = longitude, y = latitude, size = crashes, color = has_injuries)) +
  geom_point(alpha = 0.5) +                # Add points with transparency
  scale_size_continuous(range = c(0, 5)) + # Adjust the size range
  labs(x = NULL, y = NULL, title = "Crashes in Chicago 2019") +  # Remove axis labels and set title
  theme_minimal() +  # Minimal theme
  theme(axis.text = element_blank(),  # Remove axis text
        axis.title = element_blank(),  # Remove axis title
        plot.title = element_text(hjust = 0.5))  # Center plot title

# Filter data for crash_year == 2020
crash_df_2020 <- subset(crash_df_, crash_year == 2020)

# Create scatter plot - 2019
ggplot(crash_df_2020, aes(x = longitude, y = latitude, size = crashes, color = has_injuries)) +
  geom_point(alpha = 0.5) +                # Add points with transparency
  scale_size_continuous(range = c(0, 5)) + # Adjust the size range
  labs(x = NULL, y = NULL, title = "Crashes in Chicago 2020") +  # Remove axis labels and set title
  theme_minimal() +  # Minimal theme
  theme(axis.text = element_blank(),  # Remove axis text
        axis.title = element_blank(),  # Remove axis title
        plot.title = element_text(hjust = 0.5))  # Center plot title